Hello everyone, welcome to the birdcall detection competition. This is a tough one, especially for me, since I have never worked with audio data before. Just as augmentations tend to boost performance in computer vision tasks, audio tasks are no different: as discussed in many forum threads here, augmentations and audio transforms will play a major role in this competition.
While we have a great library for computer vision tasks in albumentations, we don't have anything comparable for audio data. It would surely make our lives easier to have something like that for audio, so we could quickly test different augmentations, right?
Well, since my baseline model's F1 score has been stuck at zero for both training and validation due to some bug, and I have been scratching my head over it for the past four days, I thought I would build audio transforms on top of albumentations in the meantime to refresh my mind, so anyone can use them directly, just like our computer vision transforms.
I have explained the use of each audio transform separately, and at the end I have added an example of using the transforms with a PyTorch dataset, just as we use albumentations transforms directly.
This notebook is inspired by Alex's notebook here, where he does the same thing for text data. Many thanks to Alex for teaching me this.
import os, random, pandas as pd, numpy as np, glob, pdb
import matplotlib.pyplot as plt
import cv2
import IPython.display as ipd
import librosa
from albumentations.core.transforms_interface import DualTransform, BasicTransform
import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
import matplotlib.pyplot as plt
from termcolor import colored
from scipy import stats
import inspect, os, random
import operator as op
from functools import reduce
import IPython
from tabulate import tabulate
#####################################
# Color the background #
#####################################
def bg(value, type='num', color='blue'):
value = str('{:,}'.format(value)) if type == 'num' else str(value)
return colored(' ' + value + ' ', color, attrs=['reverse', 'blink'])
#####################################
# Print variable name #
#####################################
# Credits: https://stackoverflow.com/questions/18425225/getting-the-name-of-a-variable-as-a-string
def var2str(var):
"""
Gets the name of var. Does it from the out most frame inner-wards.
:param var: variable to get name from.
:return: string
"""
for fi in reversed(inspect.stack()):
names = [var_name for var_name, var_val in fi.frame.f_locals.items() if var_val is var]
if len(names) > 0:
return names[0]
##############################################
# Print Shape of multiple variables #
##############################################
def shape(*args):
all_dfs = []
for df in args:
if isinstance(df, list):
n_columns = bg(1)
n_rows = bg(len(df))
else:
n_columns = f"~{bg(1 if len(df.shape) <= 1 else df.shape[1])}"
n_rows = f"~{bg(df.shape[0])}"
all_dfs.append([var2str(df), str(n_rows), str(n_columns)])
print(tabulate(all_dfs, headers=["Dataframe", "n_rows", "n_columns"], tablefmt="grid"))
#####################################
# dd for debuging #
#####################################
def dd(*args):
print('--' * 20)
for x in args:
varName = colored(var2str(x), attrs=['blink'])
# Get the type of the variable.
try:
print(f"~> Type of {varName}: {colored(type(x), 'green')}")
except:
print(f"~> Can't get the {colored('type', 'green')} of {varName}")
# Get the shape of the variable.
try:
print(f"~> Shape of {varName}: {colored(str(x.shape), 'blue')}")
except:
print(f"~> Length of {varName}: {colored(str(len(x)), 'blue')}")
# Get the first value of the variable.
try:
print(f"~> First Value of {varName}: {x[0]}")
except:
            if isinstance(x, (pd.DataFrame, pd.Series)):
                print(f"~> First Row of {varName}: \n\n{x.iloc[0]}")
            elif isinstance(x, dict):
print(f"~> Can't show the first value of a {colored('dictionary', 'red')}.")
print('--' * 20)
Note that the transforms are built so that you can apply them to audio data after reading it as a raw time-series numpy array. If you would rather have the transform classes read the audio file themselves, you can easily add a function to the main class below that reads audio data from a file path, and call it in every transform class.
This implementation assumes you will apply these transforms after cropping 5-second clips from the audio, which seems to work best so far, judging by the results shared in the discussion forums by people currently at the top of the leaderboard.
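For instance, a minimal sketch of such a reading helper (my own; the name read_audio and the 22050 Hz default are assumptions, not part of the transforms below):

# Hypothetical helper (mine): lets a transform accept a file path instead of an
# array; you could call it at the top of each apply() if you go that route.
def read_audio(file_path, sr=22050):
    '''Load an audio file as a mono float32 time series at the given rate.'''
    y, _ = librosa.load(file_path, sr=sr, mono=True)
    return y.astype(np.float32)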
class AudioTransform(BasicTransform):
"""
    Transform for an audio task. This is the main class, where we override the
    targets property and the update_params function for our needs.
"""
@property
def targets(self):
return {"data": self.apply}
def update_params(self, params, **kwargs):
if hasattr(self, "interpolation"):
params["interpolation"] = self.interpolation
if hasattr(self, "fill_value"):
params["fill_value"] = self.fill_value
return params
Shift the start time of the audio by some margin
class TimeShifting(AudioTransform):
""" Do time shifting of audio """
def __init__(self, always_apply=False, p=0.5):
super(TimeShifting, self).__init__(always_apply, p)
def apply(self,data,**params):
'''
data : ndarray of audio timeseries
'''
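        # Shift the start point by up to +/-80000 samples, roughly 3.6 s at sr=22050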
start_ = int(np.random.uniform(-80000,80000))
if start_ >= 0:
audio_time_shift = np.r_[data[start_:], np.random.uniform(-0.001,0.001, start_)]
else:
audio_time_shift = np.r_[np.random.uniform(-0.001,0.001, -start_), data[:start_]]
return audio_time_shift
General Usage
audio_path = '../input/birdsong-recognition/train_audio/aldfly/XC181484.mp3'
y, sr = librosa.load(audio_path, sr=22050)
print("Audio initially")
ipd.Audio(y, rate=sr)
transform = TimeShifting(p=1.0)
transformed_audio = transform(data=y)['data']
print("Audio after transformation")
ipd.Audio(transformed_audio, rate=sr)
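Note that the ±80000-sample shift above implicitly assumes a 22050 Hz sampling rate. A variant parameterized in seconds (my own sketch; the class name and arguments are assumptions) adapts to any rate:

# Hedged sketch (mine): same idea as TimeShifting, but the maximum shift is
# given in seconds so it scales with the sampling rate.
class TimeShiftingSec(AudioTransform):
    def __init__(self, max_shift_sec=3.0, sr=22050, always_apply=False, p=0.5):
        super().__init__(always_apply, p)
        self.max_shift = int(max_shift_sec * sr)
    def apply(self, data, **params):
        shift = int(np.random.uniform(-self.max_shift, self.max_shift))
        filler = np.random.uniform(-0.001, 0.001, abs(shift))  # low-level noise padding
        if shift >= 0:
            return np.r_[data[shift:], filler]
        return np.r_[filler, data[:shift]]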
Increase or decrease the speed of the audio under consideration
class SpeedTuning(AudioTransform):
    """ Do speed tuning of audio """
    def __init__(self, always_apply=False, p=0.5, speed_rate=None):
        '''
        Pass a speed_rate between 0.5 and 1.5 for best results; if None,
        a rate is drawn uniformly from (0.6, 1.3).
        '''
        super(SpeedTuning, self).__init__(always_apply, p)
        self.speed_rate = speed_rate or np.random.uniform(0.6, 1.3)
def apply(self, data, **params):
'''
data: ndarray of audio timeseries.
'''
audio_speed_tune = cv2.resize(data, (1, int(len(data)*self.speed_rate))).squeeze()
diff = abs(len(data) - len(audio_speed_tune))
if len(audio_speed_tune) < len(data):
audio_speed_tune = np.r_[np.random.uniform(-0.001, 0.001, int(diff/2)),
audio_speed_tune,
np.random.uniform(-0.001, 0.001, int(np.ceil(diff/2)))]
else:
audio_speed_tune = audio_speed_tune[(diff//2):(diff//2)+len(data)]
return audio_speed_tune
General Usage
audio_path = '../input/birdsong-recognition/train_audio/ameavo/XC133080.mp3'
y, sr = librosa.load(audio_path, sr=22050)
print('Audio initially')
ipd.Audio(y, rate=sr)
transform = SpeedTuning(p=1.0, speed_rate=1.5)
print('Audio after transform')
ipd.Audio(transform(data=y)['data'], rate=sr)
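The cv2.resize call may look odd on 1-D audio, but it simply performs linear interpolation, resampling the signal to int(len(data) * speed_rate) samples. Here is the same resampling written with plain numpy (my own illustration, just to demystify the trick):

# Illustration (mine): the core of SpeedTuning is 1-D linear interpolation.
def speed_tune_np(data, speed_rate):
    new_len = int(len(data) * speed_rate)
    positions = np.linspace(0, len(data) - 1, new_len)  # where to sample the original
    return np.interp(positions, np.arange(len(data)), data).astype(data.dtype)

y_slow = speed_tune_np(y, 1.5)  # 1.5x as many samples before SpeedTuning crops back
print(len(y), '->', len(y_slow))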
Stretch the audio under consideration. Unlike SpeedTuning above, librosa's time_stretch changes the duration without changing the pitch
class StretchAudio(AudioTransform):
""" Do stretching of audio file. """
def __init__(self, always_apply=False, p=0.5, rate=None):
AudioTransform.__init__(self, always_apply, p)
self.rate = rate or np.random.uniform(0.5, 1.5)
def apply(self, data, **params):
'''
data: ndarray of audio timeseries.
'''
init_input_len = len(data)
        data = librosa.effects.time_stretch(data, rate=self.rate)  # keyword arg works across librosa versions
if len(data) > init_input_len:
data = data[:init_input_len]
else:
data = np.pad(data, (0, max(0, init_input_len - len(data))), "constant")
return data
General Usage
audio_path = '../input/birdsong-recognition/train_audio/ameavo/XC292919.mp3'
y, sr = librosa.load(audio_path, sr=22050)
print('Audio initially')
ipd.Audio(y, rate=sr)
transform = StretchAudio(p=1.0)
print('Audio after transform')
ipd.Audio(transform(data=y)['data'], rate=sr)
Shift the pitch of an audio clip by a given number of semitones
class PitchShift(AudioTransform):
    """
    Do pitch shifting of audio.
    """
    def __init__(self, always_apply=False, p=0.5, n_steps=None):
        AudioTransform.__init__(self, always_apply, p)
        '''
        n_steps is the number of semitones to shift by.
        '''
        self.n_steps = n_steps
def apply(self, data, **params):
'''
data: ndarray of audio timeseries.
'''
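        # Note: the sampling rate is hardcoded to 22050 Hz, matching how the clips are loaded in the examples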
return librosa.effects.pitch_shift(data, sr=22050, n_steps=self.n_steps)
General Usage
audio_path = '../input/birdsong-recognition/train_audio/ameavo/XC292919.mp3'
y, sr = librosa.load(audio_path, sr=22050)
print('Audio initially')
ipd.Audio(y, rate=sr)
transform = PitchShift(p=1.0, n_steps=4)
print('Audio after transform')
ipd.Audio(transform(data=y)['data'], rate=sr)
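For intuition: shifting by n semitones multiplies every frequency by 2^(n/12), so n_steps=4 raises frequencies by roughly a factor of 1.26, and n_steps=12 is exactly one octave:

# Quick check (mine) of the semitone-to-frequency-ratio relationship.
for n_steps in [4, 12, -12]:
    print(f'{n_steps:+} semitones -> x{2 ** (n_steps / 12):.3f} frequency')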
Add Gaussian Noise to the audio
class AddGaussianNoise(AudioTransform):
""" Add GaussianNoise to the audio """
def __init__(self, always_apply=False, p=0.5):
AudioTransform.__init__(self, always_apply, p)
def apply(self, data, **params):
'''
data: ndarray of audio timeseries.
'''
noise = np.random.randn(len(data))
data_wn = data + 0.005 * noise
return data_wn
General Usage
audio_path = '../input/birdsong-recognition/train_audio/ameavo/XC292919.mp3'
y, sr = librosa.load(audio_path, sr=22050)
print('Audio initially')
ipd.Audio(y, rate=sr)
transform = AddGaussianNoise(p=1.0)
print('Audio after transform')
ipd.Audio(transform(data=y)['data'], rate=sr)
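The fixed 0.005 factor above adds the same noise level to loud and quiet recordings alike. If you want finer control, a variant that targets a random signal-to-noise ratio is easy to write; this is my own sketch, and the class name AddGaussianSNR and its parameters are assumptions, not part of this notebook:

# Hedged sketch (mine): add Gaussian noise at a random SNR in decibels,
# so the noise level adapts to how loud the recording actually is.
class AddGaussianSNR(AudioTransform):
    def __init__(self, min_snr_db=5.0, max_snr_db=20.0, always_apply=False, p=0.5):
        super().__init__(always_apply, p)
        self.min_snr_db = min_snr_db
        self.max_snr_db = max_snr_db
    def apply(self, data, **params):
        snr_db = np.random.uniform(self.min_snr_db, self.max_snr_db)
        signal_rms = np.sqrt(np.mean(data ** 2))
        # From SNR_dB = 20 * log10(signal_rms / noise_rms):
        noise_rms = signal_rms / (10 ** (snr_db / 20))
        noise = np.random.randn(len(data)).astype(data.dtype) * noise_rms
        return data + noise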
Add audio from any file as custom noise to the audio under consideration. I use audio from the Freesound Audio Tagging competition as an example
class AddCustomNoise(AudioTransform):
"""
    Add noise from any custom audio files: just pass the path to the directory
    where the noise files are stored and you are good to go.
"""
def __init__(self,file_dir, always_apply=False, p=0.5 ):
super(AddCustomNoise, self).__init__(always_apply, p)
'''
file_dir must be of form '.../input/.../something'
'''
self.noise_files = glob.glob(file_dir+'/*')
def apply(self,data,**params):
'''
data : ndarray of audio timeseries
'''
        nf = self.noise_files[np.random.randint(len(self.noise_files))]
noise,_ = librosa.load(nf)
if len(noise)>len(data):
start_ = np.random.randint(len(noise)-len(data))
noise = noise[start_ : start_+len(data)]
else:
noise = np.pad(noise, (0, len(data)-len(noise)), "constant")
data_wn= data + noise
return data_wn
General Usage
audio_path = '../input/birdsong-recognition/train_audio/ameavo/XC292919.mp3'
y, sr = librosa.load(audio_path, sr=22050)
print('Audio initially')
ipd.Audio(y, rate=sr)
transform = AddCustomNoise(file_dir='../input/freesound-audio-tagging/audio_train', p=1.0)
print('Audio after transform')
ipd.Audio(transform(data=y)['data'], rate=sr)
Invert the polarity of the audio, i.e. multiply every sample by -1

class PolarityInversion(AudioTransform):
def __init__(self, always_apply=False, p=0.5):
AudioTransform.__init__(self, always_apply, p)
def apply(self, data, **params):
return -data
General Usage
audio_path = '../input/birdsong-recognition/train_audio/ameavo/XC292919.mp3'
y, sr = librosa.load(audio_path, sr=22050)
print('Audio initially')
ipd.Audio(y, rate=sr)
transform = PolarityInversion(p=1.0)
print('Audio after transform')
ipd.Audio(transform(data=y)['data'], rate=sr)
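Don't expect to hear a difference above: an inverted clip sounds identical on its own because only the sign of the waveform changes, which you can verify directly:

# Sanity check (mine): the transform returns exactly the negated waveform.
inverted = PolarityInversion(p=1.0)(data=y)['data']
print(np.allclose(inverted, -y))  # True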
Scale the loudness of the audio by a random gain

class Gain(AudioTransform):
    """
    Multiply the audio by a random amplitude factor to reduce or increase the volume.
    This technique can help a model become somewhat invariant to the overall gain of the input audio.
    """
def __init__(self, min_gain_in_db=-12, max_gain_in_db=12, always_apply=False, p=0.5):
AudioTransform.__init__(self, always_apply, p)
assert min_gain_in_db <= max_gain_in_db
self.min_gain_in_db = min_gain_in_db
self.max_gain_in_db = max_gain_in_db
def apply(self, data, **args):
amplitude_ratio = 10**(random.uniform(self.min_gain_in_db, self.max_gain_in_db)/20)
return data * amplitude_ratio
General Usage
audio_path = '../input/birdsong-recognition/train_audio/ameavo/XC292919.mp3'
y, sr = librosa.load(audio_path, sr=22050)
print('Audio initially')
ipd.Audio(y, rate=sr)
transform = Gain(p=1.0, min_gain_in_db=-20, max_gain_in_db=-6)  # a clearly audible volume drop
print('Audio after transform')
ipd.Audio(transform(data=y)['data'], rate=sr)
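As a quick sanity check on the decibel math used in Gain: the amplitude ratio is 10^(dB/20), so +6 dB roughly doubles the amplitude and -6 dB roughly halves it:

# Worked example (mine) of the dB-to-amplitude conversion used in Gain.
for db in [6, 0, -6, -12]:
    print(f'{db:+} dB -> x{10 ** (db / 20):.3f} amplitude')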
Cut-out is a famous augmentation for images, used to make the model generalize better: a random patch of pixels is set to zero. Here we do the same with a random span of audio samples
class CutOut(AudioTransform):
def __init__(self, always_apply=False, p=0.5):
AudioTransform.__init__(self, always_apply, p)
    def apply(self, data, **params):
        data = data.copy()  # avoid mutating the caller's array in place
        start_ = np.random.randint(0, len(data))
        end_ = np.random.randint(start_, len(data))
        data[start_:end_] = 0
        return data
General Usage
audio_path = '../input/birdsong-recognition/train_audio/ameavo/XC292919.mp3'
y, sr = librosa.load(audio_path, sr=22050)
print('Audio initially')
ipd.Audio(y, rate=sr)
transform = CutOut(p=1.0)
print('Audio after transform')
ipd.Audio(transform(data=y)['data'], rate=sr)
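One caveat: the CutOut above can zero out nearly the whole clip, which risks erasing the very call the label refers to. A bounded variant (my own sketch; the class name and the max_fraction parameter are assumptions) caps the masked span:

# Hedged sketch (mine): like CutOut, but the zeroed span is at most a fixed
# fraction of the clip, so most of the signal (and hopefully the call) survives.
class BoundedCutOut(AudioTransform):
    def __init__(self, max_fraction=0.1, always_apply=False, p=0.5):
        super().__init__(always_apply, p)
        self.max_fraction = max_fraction
    def apply(self, data, **params):
        data = data.copy()  # don't mutate the caller's array
        max_span = max(2, int(len(data) * self.max_fraction))
        span = np.random.randint(1, max_span)
        start = np.random.randint(0, len(data) - span)
        data[start:start + span] = 0
        return data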
So far we have seen how the transforms can be used separately; now let's see how to use them with a PyTorch Dataset class, just as we do for computer vision tasks
I will be using the dataset from the public kernels. It is now up to you to decide whether you want to apply the transforms before or after clip cropping
import albumentations
def get_train_transforms():
return albumentations.Compose([
        TimeShifting(p=0.9),  # p < 1.0 so the net still sees some unshifted clips
albumentations.OneOf([
AddCustomNoise(file_dir='../input/freesound-audio-tagging/audio_train', p=0.8),
SpeedTuning(p=0.8),
]),
AddGaussianNoise(p=0.8),
PitchShift(p=0.5,n_steps=4),
Gain(p=0.9),
PolarityInversion(p=0.9),
StretchAudio(p=0.1),
])
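Calling the composed pipeline works exactly like calling a single transform: each sub-transform fires with its own probability, and every transform above preserves the clip length. A quick sanity check, reusing the y loaded earlier:

# Quick check (mine): apply the full pipeline to one loaded clip.
train_transforms = get_train_transforms()
y_aug = train_transforms(data=y)['data']
print(y.shape, '->', y_aug.shape)  # length is preserved by all transforms above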
from torch.utils.data import Dataset

PERIOD = 5  # crop length in seconds, as in the public kernels this dataset class comes from
class DatasetRetriever(Dataset):
def __init__(
self,
file_list,
waveform_transforms=None):
self.file_list = file_list # list of list: [file_path, ebird_code]
self.waveform_transforms = waveform_transforms
def __len__(self):
return len(self.file_list)
def __getitem__(self, idx: int):
wav_path, ebird_code = self.file_list[idx]
y, sr = librosa.load(wav_path)
if self.waveform_transforms:
y = self.waveform_transforms(data=y)['data']
else:
len_y = len(y)
effective_length = sr * PERIOD
if len_y < effective_length:
new_y = np.zeros(effective_length, dtype=y.dtype)
start = np.random.randint(effective_length - len_y)
new_y[start:start + len_y] = y
y = new_y.astype(np.float32)
elif len_y > effective_length:
start = np.random.randint(len_y - effective_length)
y = y[start:start + effective_length].astype(np.float32)
else:
y = y.astype(np.float32)
#labels = np.zeros(len(BIRD_CODE), dtype="f")
#labels[BIRD_CODE[ebird_code]] = 1
return {"waveform": y}
from pathlib import Path
tmp_list = []
ebird_d = Path('../input/birdsong-resampled-train-audio-00/aldfly')
for wav_f in ebird_d.iterdir():
tmp_list.append([ebird_d.name, wav_f.name, wav_f.as_posix()])
train_wav_path = pd.DataFrame(
tmp_list, columns=["ebird_code", "resampled_filename", "file_path"])
del tmp_list
train_file_list = train_wav_path[["file_path", "ebird_code"]].values.tolist()
from tqdm import tqdm
dataset = DatasetRetriever(file_list=train_file_list, waveform_transforms=get_train_transforms())
for sample in tqdm(dataset, total=len(dataset)):
pass
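From here it is one more line to wrap the dataset in a standard PyTorch DataLoader. Note that when waveform_transforms is set, the branch above skips cropping, so clips keep their original, variable lengths and cannot be batched together until you crop or pad to sr * PERIOD; hence batch_size=1 in this minimal sketch (mine):

from torch.utils.data import DataLoader

# Minimal sketch (mine): batch_size=1 because the transformed clips still have
# variable lengths; crop/pad to sr * PERIOD inside __getitem__ for larger batches.
loader = DataLoader(dataset, batch_size=1, shuffle=True, num_workers=2)
batch = next(iter(loader))
print(batch['waveform'].shape)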
Please note that this approach can easily be extended by anyone to write their own augmentations; I have just shown one way.
I hope you enjoyed reading this as much as I enjoyed writing it!